library(ggplot2)
library(data.table)
library(scales)
library(RColorBrewer)
library(reticulate)
library(uwot)
## Loading required package: Matrix
library(gridExtra)
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: nlme
## Loading required package: rpart
##
## spatstat 1.63-3 (nickname: 'Wet paint')
## For an introduction to spatstat, type 'beginner'
##
## Attaching package: 'spatstat'
## The following object is masked from 'package:scales':
##
## rescale
## The following object is masked from 'package:data.table':
##
## shift
library(ggdendro)
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
use_condaenv(condaenv = 'r-reticulate', required = TRUE)
data_dir <- here::here('..','data')
load(file.path(data_dir, 'diff.sampled.Rdata'))
diff_dt <- fread(file.path(data_dir, 'diff.sampled.csv.gz'))
censor_negative_min = -1500
redo_umap_clustering = F
redo_umap_param_search = F
#only redo param search if doing clustering also:
redo_umap_clustering = redo_umap_clustering && redo_umap_clustering
# map day 0 to both plus and minus
map_day_0 <- function(df) {
return(rbind(
df[k562!='none'],
df[k562=='none'][, k562 := 'cd19+'],
df[k562=='none'][, k562 := 'cd19-']
))
}
For now, skip straight to high dimensional plotting.
#grab channel map from cd4 data (cd4/8 are the same)
channel_map <- diff_opt_cd4$channel_map
# for now use all variables in the channel map except cd4/8
umap_vars <- c(paste0(names(channel_map),'_t'))
umap_vars <- umap_vars[umap_vars != 'cd_t']
sample_for_umap <- function(df, sample_size) {
umap_dt <- diff_dt[!is.na(event_id),
.SD[event_id %in% sample(unique(event_id),
min(sample_size, length(unique(event_id))))],
by=c('well', 'plate', 'day')]
}
cast_for_umap <- function(df) {
#cast it so variables are columns and
#subset sample umap data on variables
umap_cast_dt <- dcast(df,
event_id + well + plate + day + t_type ~ variable,
value.var='value')
return(umap_cast_dt)
}
scale_for_umap <- function(df, umap_vars, censor_negative_min) {
umap_dt_in <- df[, ..umap_vars]
# for points that are very negative, trim the to below the cutoff
umap_dt_in[umap_dt_in < censor_negative_min] <- censor_negative_min
# scale each input column
umap_dt_in[ ,
(names(umap_dt_in)) := lapply(.SD, scale), .SDcols = names(umap_dt_in)]
return(umap_dt_in)
}
# check scales:
# ggplot(melt(cbind(umap_dt_in, id=1:nrow(umap_dt_in)), id.var='id'),
# aes(x=value)) +
# geom_density() +
# facet_grid(variable~.) +
# scale_fill_distiller(palette='RdYlBu') +
# theme_bw()
First, find parameters with a small sample.
# use umap canned functions
umap_dt <- sample_for_umap(diff_dt, 20)
umap_cast_dt <- cast_for_umap(umap_dt)
umap_dt_in <- scale_for_umap(umap_cast_dt, umap_vars, censor_negative_min)
### Run across parameter list
#umap parameter list
min_dist_set <- c(0.0001, 0.005, 0.1)
n_neighbor_set <- c(3,6,15,30)
umap_params <- data.table(expand.grid(min_dist_set, n_neighbor_set))
# run umap via uwot library
umap_out <- umap_params[, {
print(paste0('Running: ',.BY));
as.data.table(umap(umap_dt_in, min_dist=as.numeric(.BY[1]),
n_neighbors=as.numeric(.BY[2]), verbose=F, n_threads=8, n_trees=50))
}, by=names(umap_params)]
# rename the columns
names(umap_out) <- c('min_dist','n_neighbor','umap_1','umap_2')
# add the umap output to the input dt
umap_fcs_dt <- cbind(umap_cast_dt[, 1:length(names(umap_cast_dt))], umap_out)
umap_fcs_dt[, id := 1:.N]
umap_fcs_dt <- unique(umap_dt[,
.(donor, car, k562, t_type, day, well, event_id)])[
umap_fcs_dt, on=.(t_type,well,day,event_id)]
color_points <- ggplot(umap_fcs_dt[car %in% c('CD28','41BB','KLRG1','BAFF-R','Zeta')],
aes(x=umap_1, y=umap_2, color=interaction(car, t_type))) +
geom_point(size=0.1, alpha=0.1) +
guides(colour = guide_legend(override.aes = list(alpha=1, size=3))) +
facet_grid(min_dist~n_neighbor) +
theme_bw()
color_density <- ggplot(umap_fcs_dt, aes(x=umap_1, y=umap_2)) +
geom_hex(bins = 70) +
scale_fill_continuous(
type = "viridis", limits=c(0,30), oob=scales::squish) +
facet_grid(min_dist~n_neighbor) +
theme_bw()
grid.arrange(color_points, color_density, ncol=2)
Choosing neighbors == 15 and min_dist == 1e-4.
Scaling this to 200 events per well and checking CAR/condition separation.
chosen_dist=0.005
chosen_n_neighbor=15
umap_dt <- diff_dt[!is.na(event_id),
.SD[event_id %in% sample(unique(event_id),
min(200, length(unique(event_id))))],
by=c('well', 'plate', 'day')]
#grab channel map from cd4 data (cd4/8 are the same)
channel_map <- diff_opt_cd4$channel_map
# for now use all variables in the channel map
umap_vars <- c(paste0(names(channel_map),'_t'))
#cast it so variables are columns and
#subset sample umap data on variables
umap_cast_dt <- dcast(umap_dt,
event_id + well + plate + day + t_type ~ variable,
value.var='value')
umap_dt_in <- umap_cast_dt[, ..umap_vars]
# scale each input column
umap_dt_in[ ,
(names(umap_dt_in)) := lapply(.SD, scale), .SDcols = names(umap_dt_in)]
# run umap via uwot library
umap_out <- umap(umap_dt_in, min_dist=chosen_dist,
n_neighbors=chosen_n_neighbor, verbose=T, n_threads=8, n_trees=50,
ret_nn=T)
# nearest neighbors in original space
nearest_neighbors <- umap_out$nn$euclidean$idx
neighbor_dist <- umap_out$nn$euclidean$dist
edge_weights <- scales::rescale(-neighbor_dist, to=c(0,1))
edge_weights <- edge_weights - min(edge_weights)
umap_out <- data.table(umap_out$embedding)
#nearest neighbors in embedding
umap_nn <- cbind(1:nrow(umap_fcs_dt),
nnwhich(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
umap_nd <- cbind(rep(0, nrow(umap_fcs_dt)),
nndist(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
umap_ew <- scales::rescale(-umap_nd, to=c(0,1))
umap_ew <- umap_ew - min(umap_ew)
# rename the columns
names(umap_out) <- c('umap_1','umap_2')
# add the umap output to the input dt
umap_fcs_dt <- cbind(umap_cast_dt[, 1:length(names(umap_cast_dt))], umap_out)
umap_fcs_dt[, id := 1:.N]
# add back the annotations
umap_fcs_dt <- unique(umap_dt[,
.(donor, car, k562, t_type, day, well, event_id)])[
umap_fcs_dt, on=.(t_type,well,day,event_id)]
cd4_colors = brewer.pal('Greens', n=9)[c(2,4,6,8,9)]
cd8_colors = brewer.pal('Oranges', n=9)[c(2,4,6,8,9)]
color_time <- ggplot(umap_fcs_dt[][, cd19 := (k562=='cd19+')],
aes(x=umap_1, y=umap_2, color=interaction(day, t_type))) +
geom_point(size=0.1, alpha=0.5) +
facet_grid(car~cd19) +
scale_color_manual(values=c(cd4_colors, cd8_colors)) +
guides(colour = guide_legend(override.aes = list(alpha=1, size=3)))
color_density <- ggplot(umap_fcs_dt, aes(x=umap_1, y=umap_2)) +
geom_hex(bins = 70) +
facet_grid(car~cd19) +
scale_fill_continuous(
type = "viridis", limits=c(0,30), oob=scales::squish) +
theme_bw()
color_markers <- ggplot(
melt(umap_fcs_dt, measure.vars=umap_vars)[,
value.scaled := scale(value), by='variable'][,
cd19 := (k562=='cd19+')],
aes(x=umap_1, y=umap_2, z=value.scaled)) +
stat_summary_hex(bins = 70) +
facet_grid(variable~cd19) +
scale_fill_distiller(palette='RdYlBu', limits=c(-3, 3), oob=scales::squish) +
theme_bw()
grid.arrange(color_time, color_density, color_markers, ncol=3)
import leidenalg
import igraph as ig
import pandas as pd
import numpy as np
import math
edges = []
n_neighbors = r.nearest_neighbors.shape[1]
for i, nn_row in enumerate(r.nearest_neighbors):
edge_weights = r.edge_weights[i]
for j in range(1, n_neighbors):
edges.append((int(nn_row[0]), int(nn_row[j]), edge_weights[j]))
knn_graph = ig.Graph.TupleList(edges, directed=True, weights=True)
part = leidenalg.find_partition(knn_graph,
leidenalg.ModularityVertexPartition, weights='weight')
#part = leidenalg.find_partition(knn_graph,
# leidenalg.CPMVertexPartition, weights='weight',
# resolution_parameter=0.05)
cluster_membership = [x for _,x in sorted(zip(knn_graph.vs['name'],part.membership))]
umap_fcs_dt[, cluster := factor(py$cluster_membership+1)]
# save the clustering data
fwrite(umap_fcs_dt,
compress='gzip',
file=file.path(here::here('..','data','diff.sampled.umap.csv.gz')))
# sync output back to s3
system('aws s3 sync \\
--exclude "*" \\
--include "*.Rdata" \\
--include "*.csv*" \\
~/local-tcsl/data s3://roybal-tcsl//lenti_screen_compiled_data/data')
One issue with these clusters is that they are not contiguous in the UMAP embedding:
umap_fcs_dt <- fread(
file.path(
here::here('..','data','diff.sampled.umap.csv.gz')))
umap_fcs_dt[, cluster := factor(cluster)]
umap_cluster_dt <- umap_fcs_dt[, list(
mean_umap_1= mean(umap_1),
mean_umap_2= mean(umap_2),
size= .N), by=cluster]
ggplot() +
geom_point(data=umap_fcs_dt,
aes(x=umap_1, y=umap_2, color=cluster), size=0.2, alpha=0.2) +
geom_label(data=umap_cluster_dt, aes(
x=mean_umap_1, y=mean_umap_2,
label=paste(cluster,size,sep='\n'),
color=cluster), alpha=0.3) +
theme_minimal()
ggplot() +
geom_point(data=umap_fcs_dt,
aes(x=umap_1, y=umap_2, color=cluster), size=0.2, alpha=0.2) +
geom_label(data=umap_cluster_dt, aes(
x=mean_umap_1, y=mean_umap_2,
label=paste(cluster,size,sep='\n'),
color=cluster), alpha=0.3) +
facet_wrap(~cluster) +
theme_bw()
cluster_pct_vars <- c(paste0(
names(channel_map),'_t'),
'FSC-A','SSC-A','donor')
max_cluster <- max(as.numeric(umap_fcs_dt$cluster), na.rm=T)
umap_var_melt_dt <-melt(
umap_fcs_dt[!is.na(donor)], measure.vars= c(umap_vars,'donor'))
## Warning in melt.data.table(umap_fcs_dt[!is.na(donor)], measure.vars =
## c(umap_vars, : 'measure.vars' [ccr7_t, gfp_t, zombie_t, myc_t, ...] are not all
## of the same type. By order of hierarchy, the molten data value column will be of
## type 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
# Use z scale to plot params
umap_var_melt_dt[, value_scaled := scale(value), by=variable]
param_cluster_pct_dt <- umap_var_melt_dt[
!is.na(cluster) & variable %in% cluster_pct_vars,
list(
clust_mean = mean(value_scaled),
clust_sd = sd(value_scaled)),
by=c('variable','cluster')]
# Make dendrogram
cluster_mean_cast <- dcast(
param_cluster_pct_dt,
variable ~ cluster,
value.var='clust_mean')
cluster_dendro_m <- as.matrix(cluster_mean_cast[, -c(1)])
rownames(cluster_dendro_m) <- unlist(cluster_mean_cast[,1])
dendro_clusters <- as.dendrogram(hclust(d = dist(x = t(cluster_dendro_m))))
dendro_vars <- as.dendrogram(hclust(d = dist(x = cluster_dendro_m)))
# Create dendrogram plot
dendro_vars_plot <- ggdendrogram(data = dendro_vars, rotate = TRUE) +
theme(axis.text.y = element_text(size = 6))
dendro_clusters_plot <- ggdendrogram(data = dendro_clusters, rotate = TRUE) +
theme(axis.text.y = element_text(size = 6))
# column order
cluster_order <- order.dendrogram(dendro_clusters)
cluster_names <- colnames(cluster_dendro_m[, cluster_order])
param_cluster_pct_dt[, cluster:= factor(cluster, levels = cluster_names)]
# row order
var_order <- order.dendrogram(dendro_vars)
var_names <- row.names(cluster_dendro_m[var_order, ])
param_cluster_pct_dt$variable <- factor(
param_cluster_pct_dt$variable,
levels = var_names)
# heatmap
var_heatmap <- ggplot(param_cluster_pct_dt) +
geom_tile(aes(x=cluster, y=variable, fill=clust_mean), color='black') +
scale_fill_distiller(palette='PiYG', limits=c(-3,3), oob=scales::squish) +
theme_minimal()
# col dendro
dendro_data_col <- dendro_data(dendro_clusters, type = "rectangle")
dendro_col <- axis_canvas(var_heatmap, axis = "x") +
geom_segment(data = segment(dendro_data_col),
aes(x = x, y = y, xend = xend, yend = yend))
plot_dendroheat <- insert_xaxis_grob(var_heatmap,
dendro_col, grid::unit(0.2, "null"), position = "top")
ggdraw(plot_dendroheat)
## Warning: Removed 5 rows containing missing values (geom_text).